Load in required packages¶
In [1]:
import pandas as pd
from datetime import datetime, timedelta
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
In [2]:
# Import CSV with renamed columns
df = pd.read_csv("bike_data.csv")
df.info()
#Check the summary statistics of the data
df.describe()
# Clean up some columns
#day/month/year
df["date"] = pd.to_datetime(df["date"], format="%d/%m/%Y")
df.info()
#combine date column and hour column
df["datetime"] = df["date"] + pd.to_timedelta(df["hour"], unit="h")
df
#pd.set_option("future.no_silent_downcasting", True)
# Similar to is_holiday, map is_functioning to True and False
df["is_holiday"] = df["is_holiday"].replace({"No Holiday": False,
"Holiday": True}).astype(bool)
df["is_functioning"] = df["is_functioning"].astype(bool)
df.info()
# Only keep observations where the system is functioning
df = df.query("is_functioning")
df.shape
# Print out the result
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8760 entries, 0 to 8759 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 8760 non-null object 1 n_rented_bikes 8760 non-null int64 2 hour 8760 non-null int64 3 temperature_celsius 8760 non-null float64 4 humidity_pct 8760 non-null int64 5 wind_speed_mps 8760 non-null float64 6 visibility_10m 8760 non-null int64 7 dew_point_temp_c 8760 non-null float64 8 solar_radiation 8760 non-null float64 9 rainfall_mm 8760 non-null float64 10 snowfall_cm 8760 non-null float64 11 season 8760 non-null object 12 is_holiday 8760 non-null object 13 is_functioning 8760 non-null object dtypes: float64(6), int64(4), object(4) memory usage: 958.3+ KB <class 'pandas.core.frame.DataFrame'> RangeIndex: 8760 entries, 0 to 8759 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 8760 non-null datetime64[ns] 1 n_rented_bikes 8760 non-null int64 2 hour 8760 non-null int64 3 temperature_celsius 8760 non-null float64 4 humidity_pct 8760 non-null int64 5 wind_speed_mps 8760 non-null float64 6 visibility_10m 8760 non-null int64 7 dew_point_temp_c 8760 non-null float64 8 solar_radiation 8760 non-null float64 9 rainfall_mm 8760 non-null float64 10 snowfall_cm 8760 non-null float64 11 season 8760 non-null object 12 is_holiday 8760 non-null object 13 is_functioning 8760 non-null object dtypes: datetime64[ns](1), float64(6), int64(4), object(3) memory usage: 958.3+ KB <class 'pandas.core.frame.DataFrame'> RangeIndex: 8760 entries, 0 to 8759 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 8760 non-null datetime64[ns] 1 n_rented_bikes 8760 non-null int64 2 hour 8760 non-null int64 3 temperature_celsius 8760 non-null float64 4 humidity_pct 8760 non-null int64 5 wind_speed_mps 8760 non-null float64 6 visibility_10m 8760 non-null int64 7 dew_point_temp_c 8760 non-null float64 8 solar_radiation 8760 non-null float64 9 rainfall_mm 8760 non-null float64 10 snowfall_cm 8760 non-null float64 11 season 8760 non-null object 12 is_holiday 8760 non-null bool 13 is_functioning 8760 non-null bool 14 datetime 8760 non-null datetime64[ns] dtypes: bool(2), datetime64[ns](2), float64(6), int64(4), object(1) memory usage: 906.9+ KB
Out[2]:
(8760, 15)
Visualize bike rentals over time¶
In [5]:
# Create a line plot of rented bikes over time
px.line(df,x="datetime",y="n_rented_bikes")
In [6]:
# Calculate the total number of rented bikes per day
df_day = df.groupby("date")["n_rented_bikes"].sum().reset_index()
by_day = df.groupby("date", as_index=False).agg({"n_rented_bikes": "sum"})
per_day = df.groupby("date")["n_rented_bikes"].sum()
# Create a line plot showing total number of bikes per day over time
In [7]:
# Copy the previous chain of manipulations and add season as a variable to group by
by_day_season = df.groupby(["date","season"], as_index=False).agg({"n_rented_bikes": "sum"})
# Copy the code for the previous line plot and map season to color
px.line(by_day_season, x="date", y="n_rented_bikes", color="season")
# Copy the code for the previous line plot and map season to color
Explore the relation between weather and rentals¶
In [18]:
#Query df to only keep observations at noon
noon_rides = df.query('hour == 12')
# Create a scatter plot showing temperature against number of rented bikes
# Create a subplot figure with 2 rows and 2 columns
fig = make_subplots(rows=2, cols=2, subplot_titles=("Temperature vs Rented Bikes (with Trendline)",
"Date vs Rented Bikes (colored by Temperature)",
"Temperature vs Rented Bikes (no Trendline)",
"Season vs Rented Bikes"
))
#Graph1: Scatter plot with trendline
fig1 = px.scatter(noon_rides, x='temperature_celsius', y='n_rented_bikes', trendline='lowess')
fig.add_trace(fig1.data[0], row=1, col=1)
fig.add_trace(fig1.data[1], row=1, col=1)
#Graph2: Scatter plot with color-coded temperature
fig2 = px.scatter(df, x="datetime", y="n_rented_bikes", color="temperature_celsius")
fig.add_trace(fig2.data[0], row=1, col=2)
#Graph3:Simple scatter plot
fig3 = px.scatter(noon_rides, x="temperature_celsius", y="n_rented_bikes")
fig.add_trace(fig3.data[0], row=2, col=1)
#Graph4:Scatter plot with seasons
fig4 = px.scatter(df, x="season", y="n_rented_bikes")
fig.add_trace(fig4.data[0], row=2, col=2)
#Update layout
fig.update_layout(height=800, width=1000, title_text="Subplots of Rented Bikes Data", showlegend=True)
fig.show()
Explore typical daily usage pattern¶
In [22]:
# Calculate the average number of rented bikes per hour
time_of_day = df.groupby("hour", as_index=False).agg({"n_rented_bikes": "mean"})
# Create a bar chart showing the usage pattern
px.bar(time_of_day, x="hour", y = "n_rented_bikes")
In [23]:
# Copy and adapt the previous query to take into account the season
time_of_day_season = df.groupby(["hour", "season"], as_index=False).agg({"n_rented_bikes": "mean"})
# Copy and adapt the code for the previous bar chart to show usage pattern per season
px.bar(time_of_day_season, x="hour", y="n_rented_bikes", color="season", facet_col="season")
Extra: is New Year's Eve different?¶
In [24]:
# New Years dates
new_years_start = datetime(2017,12,31,12)
new_years_end = datetime(2018,1,1,12)
new_year = df.query("@new_years_start <= datetime <= @new_years_end")
px.bar(new_year,x = 'datetime', y = 'n_rented_bikes')
In [ ]: